# Load the provisional COVID-19 deaths table (by place of death and age).
df <- read_csv(
  file = "Provisional_COVID-19_Deaths_by_Place_of_Death_and_Age.csv"
)
## Rows: 104976 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Data as of, Start Date, End Date, Group, State, Place of Death, Age...
## dbl (9): Year, Month, HHS Region, COVID-19 Deaths, Total Deaths, Pneumonia D...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the population table, then strip the spaces embedded in the
# Population values so they can be converted to numeric.
tot_population <- read.csv("tot_population.csv")
tot_population$Population <- as.numeric(
  gsub(" ", "", tot_population$Population, fixed = TRUE)
)
# Adjacency list of the contiguous US states: each element is named after a
# state and holds the names of the states that border it.
# Source: https://state.1keydata.com/bordering-states-list.php
# Water borders are treated exactly like land borders.
neigh <- list(
  "Alabama" = c("Florida", "Georgia", "Mississippi", "Tennessee"),
  "Arizona" = c("California", "Colorado", "Nevada", "New Mexico", "Utah"),
  "Arkansas" = c("Louisiana", "Mississippi", "Missouri", "Oklahoma", "Tennessee", "Texas"),
  "California" = c("Arizona", "Nevada", "Oregon"),
  "Colorado" = c("Arizona", "Kansas", "Nebraska", "New Mexico", "Oklahoma", "Utah", "Wyoming"),
  "Connecticut" = c("Massachusetts", "New York", "Rhode Island"),
  "Delaware" = c("Maryland", "New Jersey", "Pennsylvania"),
  "Florida" = c("Alabama", "Georgia"),
  "Georgia" = c("Alabama", "Florida", "North Carolina", "South Carolina", "Tennessee"),
  "Idaho" = c("Montana", "Nevada", "Oregon", "Utah", "Washington", "Wyoming"),
  "Illinois" = c("Indiana", "Iowa", "Michigan", "Kentucky", "Missouri", "Wisconsin"),
  "Indiana" = c("Illinois", "Kentucky", "Michigan", "Ohio"),
  "Iowa" = c("Illinois", "Minnesota", "Missouri", "Nebraska", "South Dakota", "Wisconsin"),
  "Kansas" = c("Colorado", "Missouri", "Nebraska", "Oklahoma"),
  "Kentucky" = c("Illinois", "Indiana", "Missouri", "Ohio", "Tennessee", "Virginia", "West Virginia"),
  "Louisiana" = c("Arkansas", "Mississippi", "Texas"),
  "Maine" = c("New Hampshire"),
  "Maryland" = c("Delaware", "Pennsylvania", "Virginia", "West Virginia"),
  "Massachusetts" = c("Connecticut", "New Hampshire", "New York", "Rhode Island", "Vermont"),
  "Michigan" = c("Illinois", "Indiana", "Minnesota", "Ohio", "Wisconsin"),
  "Minnesota" = c("Iowa", "Michigan", "North Dakota", "South Dakota", "Wisconsin"),
  "Mississippi" = c("Alabama", "Arkansas", "Louisiana", "Tennessee"),
  "Missouri" = c("Arkansas", "Illinois", "Iowa", "Kansas", "Kentucky", "Nebraska", "Oklahoma", "Tennessee"),
  "Montana" = c("Idaho", "North Dakota", "South Dakota", "Wyoming"),
  "Nebraska" = c("Colorado", "Iowa", "Kansas", "Missouri", "South Dakota", "Wyoming"),
  "Nevada" = c("Arizona", "California", "Idaho", "Oregon", "Utah"),
  "New Hampshire" = c("Maine", "Massachusetts", "Vermont"),
  "New Jersey" = c("Delaware", "New York", "Pennsylvania"),
  "New Mexico" = c("Arizona", "Colorado", "Oklahoma", "Texas", "Utah"),
  "New York" = c("Connecticut", "Massachusetts", "New Jersey", "Pennsylvania", "Rhode Island", "Vermont"),
  "North Carolina" = c("Georgia", "South Carolina", "Tennessee", "Virginia"),
  "North Dakota" = c("Minnesota", "Montana", "South Dakota"),
  "Ohio" = c("Indiana", "Kentucky", "Michigan", "Pennsylvania", "West Virginia"),
  "Oklahoma" = c("Arkansas", "Colorado", "Kansas", "Missouri", "New Mexico", "Texas"),
  "Oregon" = c("California", "Idaho", "Nevada", "Washington"),
  "Pennsylvania" = c("Delaware", "Maryland", "New Jersey", "New York", "Ohio", "West Virginia"),
  "Rhode Island" = c("Connecticut", "Massachusetts", "New York"),
  "South Carolina" = c("Georgia", "North Carolina"),
  "South Dakota" = c("Iowa", "Minnesota", "Montana", "Nebraska", "North Dakota", "Wyoming"),
  "Tennessee" = c("Alabama", "Arkansas", "Georgia", "Kentucky", "Mississippi", "Missouri", "North Carolina", "Virginia"),
  "Texas" = c("Arkansas", "Louisiana", "New Mexico", "Oklahoma"),
  "Utah" = c("Arizona", "Colorado", "Idaho", "Nevada", "New Mexico", "Wyoming"),
  "Vermont" = c("Massachusetts", "New Hampshire", "New York"),
  "Virginia" = c("Kentucky", "Maryland", "North Carolina", "Tennessee", "West Virginia"),
  "Washington" = c("Idaho", "Oregon"),
  "West Virginia" = c("Kentucky", "Maryland", "Ohio", "Pennsylvania", "Virginia"),
  "Wisconsin" = c("Illinois", "Iowa", "Michigan", "Minnesota"),
  "Wyoming" = c("Colorado", "Idaho", "Montana", "Nebraska", "South Dakota", "Utah")
)
# States to analyse: every `State` value in `df` except the national
# aggregate, New York City, District of Columbia, Puerto Rico, Alaska and
# Hawaii.
states <- setdiff(
  unique(df$State),
  c("United States", "New York City", "District of Columbia", "Puerto Rico",
    "Alaska", "Hawaii")
)

# Monthly COVID-19 deaths per 100,000 inhabitants, one vector per state.
# The vectors are collected in a list and turned into `df.deaths` after the
# loop, so the number of monthly rows is taken from the data instead of being
# hard-coded.
monthly_rate <- vector("list", length(states))
names(monthly_rate) <- states

for (i in states) {
  # Monthly rows for state i, all places of death and all ages combined.
  new.cases <- filter(
    df,
    Group == "By Month",
    State == i,
    `Place of Death` == "Total - All Places of Death",
    `Age group` == "All Ages"
  )

  # Population of state i, looked up through its two-letter abbreviation
  # (first column of `tot_population`).
  population_i <- tot_population[
    tot_population[, 1] == state.abb[match(i, state.name)], 2
  ]
  monthly_rate[[i]] <- new.cases$`COVID-19 Deaths` * 100000 / population_i

  # Sanity check shown in the plot title: the sum of the monthly counts
  # (TOT1) next to the count reported in the "By Total" row (TOT2).
  c1 <- sum(new.cases$`COVID-19 Deaths`, na.rm = TRUE)
  total <- filter(
    df,
    Group == "By Total",
    State == i,
    `Place of Death` == "Total - All Places of Death",
    `Age group` == "All Ages"
  )
  c2 <- total$`COVID-19 Deaths`

  # Parse the month-end dates once and reuse them for the plot and its axis.
  end_dates <- as.Date(new.cases$`End Date`, "%m/%d/%Y")
  plot(
    end_dates, new.cases$`COVID-19 Deaths`,
    type = "h", xaxt = "n",
    xlab = "Month", ylab = "Monthly new cases of death",
    main = paste(
      i,
      paste("TOT1 =", as.character(c1)),
      paste("TOT2 =", as.character(c2)),
      sep = ", "
    )
  )
  axis.Date(
    1,
    at = seq(min(end_dates), max(end_dates) + 6, "months"),
    format = "%m-%y"
  )
}

# One column per state (named by the full state name), one row per month.
df.deaths <- as.data.frame(monthly_rate, check.names = FALSE)
# For every state i, regress its monthly deaths per 100,000 inhabitants (y)
# on the same-month rate of each of its bordering states (x) and draw the
# scatterplot with the fitted intercept (b0) and slope (b1) in the title.
for (i in states) {
  # Monthly rows of all neighbours of i that are present in the data.
  neigh_rows <- filter(
    df,
    Group == "By Month",
    is.element(State, neigh[[i]]),
    `Place of Death` == "Total - All Places of Death",
    `Age group` == "All Ages"
  )
  # Nothing to fit when i has no neighbour rows (e.g. i is not in `neigh`).
  if (nrow(neigh_rows) == 0L) {
    next
  }

  # Population of the neighbour on each row, looked up through the state
  # abbreviation stored in the first column of `tot_population`.
  neigh_abb <- state.abb[match(neigh_rows$State, state.name)]
  neigh_population <- tot_population[
    match(neigh_abb, tot_population[, 1]), 2
  ]

  # Normalized deaths of the neighbours, one value per (neighbour, month) row.
  x <- neigh_rows$`COVID-19 Deaths` * 100000 / neigh_population

  # Normalized monthly deaths of state i itself.
  own_rows <- filter(
    df,
    Group == "By Month",
    State == i,
    `Place of Death` == "Total - All Places of Death",
    `Age group` == "All Ages"
  )
  own_population <- tot_population[
    match(state.abb[match(i, state.name)], tot_population[, 1]), 2
  ]
  own_rate <- own_rows$`COVID-19 Deaths` * 100000 / own_population

  # Pair every neighbour row with the value of state i for the SAME month.
  # Matching on `End Date` keeps x and y aligned whatever the row order of
  # `df` is, and also when a listed neighbour is missing from the data.
  y <- own_rate[match(neigh_rows$`End Date`, own_rows$`End Date`)]

  # Linear prediction and plot
  fit <- lm(y ~ x)
  plot(
    x, y,
    type = "p",
    main = paste(
      i, ":",
      "b0 =", toString(round(fit$coefficients[1], 1)), ",",
      "b1 =", toString(round(fit$coefficients[2], 2))
    )
  )
}
In this investigation, we set out to answer the following questions. 1. How has the number of deaths due to COVID-19 in the United States evolved with time since the beginning of the pandemic? 2. Next, we investigate how COVID-19 deaths are distributed across some notable risk factors. In particular, how are deaths distributed across age groups? How are deaths distributed across the recorded location of death (meaning type of institution or private residence)? 3. How has the mortality of COVID-19 evolved with time? 4. How are COVID-19 cases correlated across neighbouring states? 5. Can we predict the number of deaths in a given state and given month from the number of deaths in a k-th degree neighbor of that state?
In this section, we propose and implement a linear model to predict the number of monthly COVID-19 deaths in a given state as a function of the number of deaths during the same month in a k-th degree neighbor of that state. Two states X and Y are k-th degree neighbors if at least k interstate borders must be crossed to go from one state to the other.
\[ E[N_X\mid N_Y] = b_0^{(k_{XY})}+ b_1^{(k_{XY})} N_Y \]
For example, the above model can be used to predict the number of COVID-19 deaths in Massachusetts during a month given the number of deaths in Vermont during the same month, where Vermont and Massachusetts share a border and are thus 1st degree neighbors (\(k=1\)). The panel of figures below contains scatterplots of \(N_X\) against \(N_Y\) for any month and any two states that are neighbors of degree \(d\) (the degree of separation written \(k_{XY}\) in the model above). In order to find the degree of neighborship between two states, we have treated the states as nodes on a graph, where edges correspond to borders between states. We constructed an adjacency matrix between states sharing a border, and subsequently we used the Floyd–Warshall algorithm to find the smallest number of interstate borders connecting any two states. In each panel, we have indicated the parameters \(b_0\), \(b_1\) and the correlation coefficient \(\rho\) between \(N_X\) and \(N_Y\).
Next, we have plotted how the parameters \(b_0,b_1\) and \(\rho\) vary as a function of degree of separation \(d\) below:
As expected, the predictive power of a neighbor decreases with increasing degree of the neighbor. We see that the variation in the number of deaths is large, and thus it is (unsurprisingly) hard to make accurate predictions of the number of deaths. The speed of attenuation of response \(b_1\) with \(d\) nonetheless sheds light on the spatial scale of the correlations between the number of deaths.